{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.model_selection import cross_validate\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "from sklearn import svm\n",
    "from sklearn import neighbors\n",
    "from sklearn import naive_bayes\n",
    "from sklearn.svm import LinearSVC\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import LogisticRegressionCV\n",
    "from sklearn.gaussian_process import GaussianProcessClassifier\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.ensemble import BaggingClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from sklearn.externals import joblib\n",
    "\n",
    "# Render inline matplotlib figures as SVG.\n",
    "%config InlineBackend.figure_format = 'svg'\n",
    "%matplotlib inline\n",
    "\n",
    "import warnings\n",
    "# NOTE(review): this silences every warning for the rest of the session, including\n",
    "# deprecation warnings raised by the libraries imported above.\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the training table from CSV.\n",
    "data = pd.read_csv(\"fliter_train_data.csv\")\n",
    "# Target labels come from the \"safe_type\" column.\n",
    "safe_type = data[\"safe_type\"]\n",
    "# Every column from position 2 onward is used as a model feature.\n",
    "# NOTE(review): presumably columns 0-1 hold an id and the label -- confirm against the CSV header.\n",
    "features = data.iloc[:, 2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.\n",
    "train_data, test_data, train_label, test_label = train_test_split(\n",
    "    features, safe_type, test_size=0.2, random_state=0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot(test_label, y_pred, model):\n",
    "    \"\"\"Draw a ROC curve for one set of predictions.\n",
    "\n",
    "    :param test_label: ground-truth binary labels.\n",
    "    :param y_pred: predicted labels or scores for the same samples.\n",
    "    :param model: value formatted into the legend entry (e.g. a model name).\n",
    "    \"\"\"\n",
    "    font = {\"color\": \"darkred\",\n",
    "            \"size\": 13, \n",
    "            \"family\" : \"serif\"}\n",
    "\n",
    "    fpr, tpr, _ = metrics.roc_curve(test_label,  y_pred)\n",
    "    auc = metrics.roc_auc_score(test_label, y_pred)\n",
    "    # Applying a style sheet updates matplotlib's global settings, so later figures\n",
    "    # drawn in the same session pick it up as well.\n",
    "    plt.style.use(\"fivethirtyeight\")\n",
    "    fig, ax = plt.subplots()\n",
    "    ax.plot(fpr, tpr, label=\"{}, auc=\".format(model)+str(auc), color='green', linewidth=2)\n",
    "    ax.set_title(\"ROC curve\", fontdict=font)\n",
    "    # Label the axes so the figure can be read on its own.\n",
    "    ax.set_xlabel(\"False positive rate\")\n",
    "    ax.set_ylabel(\"True positive rate\")\n",
    "    leg = ax.legend(loc=\"best\")\n",
    "    plt.setp(leg.get_texts(), color=\"blue\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: an XGBClassifier constructed with no arguments, fitted on the training split.\n",
    "model = XGBClassifier()               \n",
    "model.fit(train_data, train_label)            \n",
    "# Predictions for the held-out split.\n",
    "y_pred = model.predict(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The best parameter for BaggingClassifier is {'max_samples': 0.5, 'n_estimators': 300, 'random_state': 0} with a runtime of 1259.84 seconds.\n",
      "The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 300, 'random_state': 0} with a runtime of 2480.75 seconds.\n",
      "The best parameter for LogisticRegression is {'fit_intercept': False, 'random_state': 0, 'solver': 'newton-cg'} with a runtime of 300.54 seconds.\n",
      "The best parameter for BernoulliNB is {'alpha': 0.1} with a runtime of 1.86 seconds.\n",
      "The best parameter for KNeighborsClassifier is {} with a runtime of 67.43 seconds.\n",
      "The best parameter for XGBClassifier is {'algorithm': 'auto', 'n_neighbors': 1, 'weights': 'uniform'} with a runtime of 771.57 seconds.\n",
      "Total optimization time was 81.37 minutes.\n",
      "----------\n"
     ]
    }
   ],
   "source": [
    "# Candidate value lists shared by the per-estimator parameter grids defined below.\n",
    "# NOTE(review): grid_min_samples and grid_criterion are not referenced by any active grid\n",
    "# in this cell (grid_criterion only appears in commented-out grids).\n",
    "grid_n_estimator = [10, 50, 100, 300]\n",
    "grid_ratio = [0.1, 0.25, 0.5, 0.75, 1.0]\n",
    "grid_learn = [0.01, 0.03, 0.05, 0.1, 0.25]\n",
    "grid_max_depth = [2, 4, 6, 8, 10, None]\n",
    "grid_min_samples = [5, 10, 0.03, 0.05, 0.10]\n",
    "grid_criterion = ['gini', 'entropy']\n",
    "grid_bool = [True, False]\n",
    "# Single seed so every seeded estimator is searched with random_state / seed = 0.\n",
    "grid_seed = [0]\n",
    "\n",
    "# (name, estimator) pairs to tune. The search loop at the end of this cell zips this list\n",
    "# with grid_param, so the active entries here must line up one-to-one, in order, with the\n",
    "# active grids there.\n",
    "layer_1 = [\n",
    "            #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html\n",
    "#             ('ada', AdaBoostClassifier()),\n",
    "            ('bc', BaggingClassifier()),\n",
    "#             ('etc', ExtraTreesClassifier()),\n",
    "            ('gbc', GradientBoostingClassifier()),\n",
    "#             ('rfc', RandomForestClassifier()),\n",
    "\n",
    "            #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc\n",
    "#             ('gpc', GaussianProcessClassifier()),\n",
    "\n",
    "            #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
    "            ('lr', LogisticRegression()),\n",
    "\n",
    "            #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html\n",
    "            ('bnb', naive_bayes.BernoulliNB()),\n",
    "#             ('gnb', naive_bayes.GaussianNB()),\n",
    "\n",
    "            #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html\n",
    "            ('knn', neighbors.KNeighborsClassifier()),\n",
    "\n",
    "            #SVM: http://scikit-learn.org/stable/modules/svm.html\n",
    "#             ('svc', svm.SVC(probability=True)),\n",
    "\n",
    "            #xgboost: http://xgboost.readthedocs.io/en/latest/model.html\n",
    "           ('xgb', XGBClassifier())\n",
    "\n",
    "          ]\n",
    "\n",
    "# One parameter grid per ACTIVE estimator in layer_1, in the same order. The search loop\n",
    "# pairs the two lists positionally with zip(), so a grid without a matching estimator\n",
    "# shifts every later pairing (and the last grid is silently dropped).\n",
    "grid_param = [\n",
    "#                 [{\n",
    "#                 #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html\n",
    "#                 'n_estimators': grid_n_estimator, #default=50\n",
    "#                 'learning_rate': grid_learn, #default=1\n",
    "#                 #'algorithm': ['SAMME', 'SAMME.R'], #default=’SAMME.R\n",
    "#                 'random_state': grid_seed\n",
    "#                 }],\n",
    "\n",
    "\n",
    "                [{\n",
    "                #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier\n",
    "                'n_estimators': grid_n_estimator, #default=10\n",
    "                'max_samples': grid_ratio, #default=1.0\n",
    "                'random_state': grid_seed\n",
    "                 }],\n",
    "\n",
    "\n",
    "#                 [{\n",
    "#                 #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier\n",
    "#                 'n_estimators': grid_n_estimator, #default=10\n",
    "#                 'criterion': grid_criterion, #default=”gini”\n",
    "#                 'max_depth': grid_max_depth, #default=None\n",
    "#                 'random_state': grid_seed\n",
    "#                  }],\n",
    "\n",
    "\n",
    "                [{\n",
    "                #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier\n",
    "                #'loss': ['deviance', 'exponential'], #default=’deviance’\n",
    "                'learning_rate': [.05], #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.\n",
    "                'n_estimators': [300], #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.\n",
    "                #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”\n",
    "                'max_depth': grid_max_depth, #default=3   \n",
    "                'random_state': grid_seed\n",
    "                 }],\n",
    "\n",
    "\n",
    "#                 [{\n",
    "#                 #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier\n",
    "#                 'n_estimators': grid_n_estimator, #default=10\n",
    "#                 'criterion': grid_criterion, #default=”gini”\n",
    "#                 'max_depth': grid_max_depth, #default=None\n",
    "#                 'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.\n",
    "#                 'random_state': grid_seed\n",
    "#                  }],\n",
    "\n",
    "#                 [{    \n",
    "#                 #GaussianProcessClassifier\n",
    "#                 'max_iter_predict': grid_n_estimator, #default: 100\n",
    "#                 'random_state': grid_seed\n",
    "#                 }],\n",
    "\n",
    "\n",
    "                [{\n",
    "                #LogisticRegression - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
    "                'fit_intercept': grid_bool, #default: True\n",
    "                #'penalty': ['l1','l2'],\n",
    "                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs\n",
    "                'random_state': grid_seed\n",
    "                 }],\n",
    "\n",
    "\n",
    "                [{\n",
    "                #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB\n",
    "                'alpha': grid_ratio, #default: 1.0\n",
    "                 }],\n",
    "\n",
    "\n",
    "#                 #GaussianNB - disabled to match the commented-out ('gnb', ...) estimator in layer_1.\n",
    "#                 #Left active, this empty grid was paired with KNeighborsClassifier and pushed the\n",
    "#                 #KNN grid onto XGBClassifier, so the XGBoost grid was never searched.\n",
    "#                 [{}],\n",
    "\n",
    "                [{\n",
    "                #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier\n",
    "                'n_neighbors': [1,2,3,4,5,6,7], #default: 5\n",
    "                'weights': ['uniform', 'distance'], #default = ‘uniform’\n",
    "                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']\n",
    "                }],\n",
    "\n",
    "\n",
    "#                 [{\n",
    "#                 #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC\n",
    "#                 #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r\n",
    "#                 #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],\n",
    "#                 'C': [1,2,3,4,5], #default=1.0\n",
    "#                 'gamma': grid_ratio, #edfault: auto\n",
    "#                 'decision_function_shape': ['ovo', 'ovr'], #default:ovr\n",
    "#                 'probability': [True],\n",
    "#                 'random_state': grid_seed\n",
    "#                  }],\n",
    "\n",
    "\n",
    "                [{\n",
    "                #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html\n",
    "                'learning_rate': grid_learn, #default: .3\n",
    "                'max_depth': [1,2,4,6,8,10], #default 2\n",
    "                'n_estimators': grid_n_estimator, \n",
    "                'seed': grid_seed  \n",
    "                 }]   \n",
    "            ]\n",
    "\n",
    "\n",
    "\n",
    "# Grid-search every (estimator, grid) pair, reporting the winning settings and the time taken.\n",
    "start_total = time.perf_counter()\n",
    "for (est_name, estimator), param in zip(layer_1, grid_param):\n",
    "    start = time.perf_counter()\n",
    "    best_search = GridSearchCV(estimator=estimator, param_grid=param, cv=5, scoring='roc_auc')\n",
    "    best_search.fit(features, safe_type)\n",
    "    run = time.perf_counter() - start\n",
    "\n",
    "    best_param = best_search.best_params_\n",
    "    report = 'The best parameter for {} is {} with a runtime of {:.2f} seconds.'\n",
    "    print(report.format(estimator.__class__.__name__, best_param, run))\n",
    "    # Write the winning settings back onto the estimator object held in layer_1.\n",
    "    estimator.set_params(**best_param)\n",
    "\n",
    "run_total = time.perf_counter() - start_total\n",
    "print('Total optimization time was {:.2f} minutes.'.format(run_total/60))\n",
    "print('-'*10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Estimator list handed to the voting ensembles below. Unlike the list used in the\n",
    "# grid-search cell, the SVC entry is enabled here.\n",
    "# NOTE(review): this rebinds layer_1 to freshly constructed estimators, so parameters applied\n",
    "# with set_params() to the earlier layer_1 objects are not carried over -- confirm this is intended.\n",
    "layer_1 = [\n",
    "            #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html\n",
    "#             ('ada', AdaBoostClassifier()),\n",
    "            ('bc', BaggingClassifier()),\n",
    "#             ('etc', ExtraTreesClassifier()),\n",
    "            ('gbc', GradientBoostingClassifier()),\n",
    "#             ('rfc', RandomForestClassifier()),\n",
    "\n",
    "            #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc\n",
    "#             ('gpc', GaussianProcessClassifier()),\n",
    "\n",
    "            #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
    "            ('lr', LogisticRegression()),\n",
    "\n",
    "            #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html\n",
    "            ('bnb', naive_bayes.BernoulliNB()),\n",
    "#             ('gnb', naive_bayes.GaussianNB()),\n",
    "\n",
    "            #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html\n",
    "            ('knn', neighbors.KNeighborsClassifier()),\n",
    "\n",
    "            #SVM: http://scikit-learn.org/stable/modules/svm.html\n",
    "            ('svc', svm.SVC(probability=True)),\n",
    "\n",
    "            #xgboost: http://xgboost.readthedocs.io/en/latest/model.html\n",
    "           ('xgb', XGBClassifier())\n",
    "\n",
    "          ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Soft Voting Training w/bin score mean: 98.43\n",
      "Soft Voting Test w/bin score mean: 97.14\n",
      "Soft Voting Test w/bin score 3*std: +/- 0.68\n",
      "----------\n"
     ]
    }
   ],
   "source": [
    "# Soft-voting ensemble over the layer_1 estimators, scored with 5-fold cross-validation\n",
    "# and then refitted on the full data set.\n",
    "vote_soft = VotingClassifier(estimators=layer_1 , voting = 'soft')\n",
    "# return_train_score is requested explicitly: the 'train_score' key read below is only\n",
    "# present when it is True, and scikit-learn releases differ in the default.\n",
    "vote_soft_cv = cross_validate(vote_soft, features, safe_type, cv=5, return_train_score=True)\n",
    "vote_soft.fit(features, safe_type)\n",
    "\n",
    "print(\"Soft Voting Training w/bin score mean: {:.2f}\".format(vote_soft_cv['train_score'].mean()*100)) \n",
    "print(\"Soft Voting Test w/bin score mean: {:.2f}\".format(vote_soft_cv['test_score'].mean()*100))\n",
    "print(\"Soft Voting Test w/bin score 3*std: +/- {:.2f}\".format(vote_soft_cv['test_score'].std()*100*3))\n",
    "print('-'*10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Soft Voting Training w/bin score mean: 98.79\n",
      "Soft Voting Test w/bin score mean: 97.85\n",
      "Soft Voting Test w/bin score 3*std: +/- 0.46\n",
      "----------\n"
     ]
    }
   ],
   "source": [
    "# Second soft-voting ensemble built from layer_1, scored with 5-fold cross-validation\n",
    "# and then refitted on the full data set.\n",
    "gv_vote_soft = VotingClassifier(estimators=layer_1 , voting = 'soft')\n",
    "# return_train_score is requested explicitly: the 'train_score' key read below is only\n",
    "# present when it is True, and scikit-learn releases differ in the default.\n",
    "gv_vote_soft_cv = cross_validate(gv_vote_soft, features, safe_type, cv=5, return_train_score=True)\n",
    "gv_vote_soft.fit(features, safe_type)\n",
    "\n",
    "print(\"Soft Voting Training w/bin score mean: {:.2f}\".format(gv_vote_soft_cv['train_score'].mean()*100)) \n",
    "print(\"Soft Voting Test w/bin score mean: {:.2f}\".format(gv_vote_soft_cv['test_score'].mean()*100))\n",
    "print(\"Soft Voting Test w/bin score 3*std: +/- {:.2f}\".format(gv_vote_soft_cv['test_score'].std()*100*3))\n",
    "print('-'*10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['gv_vote_soft.m']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Persist the fitted ensemble to disk so it can be reloaded without retraining.\n",
    "joblib.dump(gv_vote_soft, \"gv_vote_soft.m\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the test table from CSV.\n",
    "test = pd.read_csv(\"fliter_test_data.csv\")\n",
    "# Keep the \"id\" column so predictions can be written out next to it.\n",
    "id_ = test[\"id\"]\n",
    "# Every column from position 1 onward is passed to the model as a feature.\n",
    "# NOTE(review): presumably column 0 is the id -- confirm against the CSV header.\n",
    "test_features = test.iloc[:, 1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict a label for every test row with the fitted voting ensemble.\n",
    "predict = gv_vote_soft.predict(test_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the output table (id + predicted label) and write it as CSV without the index.\n",
    "result = pd.DataFrame({\"id\": id_, \"safe_type\": predict})\n",
    "result.to_csv(\"result.csv\", encoding=\"utf-8\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload a previously saved model from disk.\n",
    "# NOTE(review): joblib files are pickle-based, so only load files from a trusted source.\n",
    "bc = joblib.load(\"./models/bc_gr_model.m\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import glob\n",
    "def read_data(file_type):\n",
    "    data = []\n",
    "    for path in glob.glob(\"./stage1_dataset/train/{}/*\".format(file_type)):\n",
    "        with open(path, \"r\") as fp:\n",
    "            data.append(fp.read())\n",
    "    return data\n",
    "\n",
    "# Build the corpus with the \"white\" documents first, followed by the \"black\" ones.\n",
    "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=3000)\n",
    "white_data = read_data(\"white\")\n",
    "black_data = read_data(\"black\")\n",
    "data = white_data + black_data\n",
    "# Labels follow the corpus order: 0 for each white document, 1 for each black one.\n",
    "white = [0] * len(white_data)\n",
    "black = [1] * len(black_data)\n",
    "safe_type = white + black\n",
    "# Learn the TF-IDF vocabulary and transform the corpus in one step.\n",
    "features = vectorizer.fit_transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_oof(model, x_train, y_train, x_test, n_splits):\n",
    "    \"\"\"\n",
    "    Compute out-of-fold predictions for the training set and fold-averaged\n",
    "    predictions for the test set.\n",
    "\n",
    "    :param model: estimator exposing fit(X, y) and predict(X); it is refitted on every fold.\n",
    "    :param x_train: training feature matrix; must expose .shape and accept an integer\n",
    "        index array for row selection.\n",
    "    :param y_train: class labels, one per training row (array, list or Series).\n",
    "    :param x_test: test set feature matrix; must expose .shape.\n",
    "    :param n_splits: number of stratified folds.\n",
    "    :type n_splits: int.\n",
    "    :return: (oof_train, oof_test), column vectors of shape (n_train, 1) and (n_test, 1).\n",
    "        oof_train holds each training row's prediction from the fold that held it out;\n",
    "        oof_test is the mean over folds of the predictions on x_test.\n",
    "    \"\"\"\n",
    "    # Positional selection below needs an ndarray: a plain list cannot be indexed with an\n",
    "    # index array, and a pandas Series would be indexed by label instead of by position.\n",
    "    y_train = np.asarray(y_train)\n",
    "    n_train, n_test = x_train.shape[0], x_test.shape[0]\n",
    "    # No random_state here: without shuffle=True it has no effect on the folds, and recent\n",
    "    # scikit-learn releases raise ValueError when it is passed that way.\n",
    "    kf = StratifiedKFold(n_splits=n_splits)\n",
    "    oof_train = np.empty((n_train, ))\n",
    "    oof_test_skf = np.empty((n_splits, n_test))\n",
    "    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):\n",
    "        kf_x_train = x_train[train_index]\n",
    "        kf_y_train = y_train[train_index]\n",
    "        kf_x_test = x_train[test_index]\n",
    "        model.fit(kf_x_train, kf_y_train)\n",
    "        # Each training row is predicted exactly once, by the model that did not see it.\n",
    "        oof_train[test_index] = model.predict(kf_x_test)\n",
    "        oof_test_skf[i, :] = model.predict(x_test)\n",
    "    oof_test = oof_test_skf.mean(axis=0)\n",
    "    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from PIL import Image\n",
    "import binascii\n",
    "\n",
    "def getMatrixfrom_bin(filename, width):\n",
    "    with open(filename, 'rb') as f:\n",
    "        content = f.read()\n",
    "    hexst = binascii.hexlify(content)  #将二进制文件转换为十六进制字符串\n",
    "    fh = np.array([int(hexst[i: i+2], 16) for i in range(0, len(hexst), 2)])  #按字节分割\n",
    "    rn = len(fh) // width\n",
    "    fh = np.reshape(fh[:rn * width], (-1, width))  #根据设定的宽度生成矩阵\n",
    "    fh = np.uint8(fh)\n",
    "    return fh\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the executable's bytes into a matrix with 512 bytes per row.\n",
    "filename = \"./pandalearning.exe\"\n",
    "im = Image.fromarray(getMatrixfrom_bin(filename, 512)) # convert the matrix to an image\n",
    "# im.save(\"your_img_filename.png\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the generated image.\n",
    "im.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pefile\n",
    "# Parse the executable with pefile.\n",
    "PEfile_Path = \"pandalearning.exe\"\n",
    "pe = pefile.PE(PEfile_Path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the string form of the parsed PE object to test.txt.\n",
    "with open(\"test.txt\", \"w\") as fp:\n",
    "    fp.write(str(pe))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from collections import *\n",
    "# 从.asm文件获取Opcode序列\n",
    "def getOpcodeSequence(filename):\n",
    "    opcode_seq = []\n",
    "    p = re.compile(r'\\s([a-fA-F0-9]{2}\\s)+\\s*([a-z]+)')\n",
    "    with open(filename) as f:\n",
    "        for line in f:\n",
    "            if line.startswith(\".text\"):\n",
    "                m = re.findall(p, line)\n",
    "                if m:\n",
    "                    opc = m[0][10]\n",
    "                    if opc != \"align\":\n",
    "                        opcode_seq.append(opc)\n",
    "    return opcode_seq\n",
    "# 根据Opcode序列，统计对应的n-gram\n",
    "def getOpcodeNgram(ops ,n = 3):\n",
    "    opngramlist = [tuple(ops[i:i+n]) for i in range(len(ops)-n)]\n",
    "    opngram = Counter(opngramlist)\n",
    "    return opngram\n",
    "# Run the opcode extraction and n-gram count (default n) on test.txt.\n",
    "# NOTE(review): an earlier cell writes str(pe) to test.txt, while getOpcodeSequence only\n",
    "# keeps lines starting with \".text\" -- confirm this file is the intended input.\n",
    "file = \"test.txt\"\n",
    "ops = getOpcodeSequence(file)\n",
    "opngram = getOpcodeNgram(ops)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the string form of the parsed PE object in memory.\n",
    "# NOTE(review): this rebinds `data`, which earlier cells use for the training table and corpus.\n",
    "data = str(pe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
